\documentclass[final,OA]{AAAI-Std}

\def\artid{0001}%

% \argmin{constraint}: typesets "arg min" as a single upright operator
% (amsmath \operatorname*) with the constraint set underneath, followed by
% a thin space before the objective, e.g. \argmin{\theta \in \Theta}{R(\theta)}.
% The previous \operatorname{arg}\!\operatorname{min} split the operator in
% two and patched the gap with a manual negative kern.
\newcommand*{\argmin}[1]{\underset{#1}{\operatorname*{arg\,min}}\;}

\begin{document}

\title{C\textsc{onsistent} P\textsc{rocedures} \textsc{for} M\textsc{ulticlass} C\textsc{lassification} \textsc{of} D\textsc{iscrete} D\textsc{iffusion} P\textsc{aths}}

\author{C\textsc{hristophe} D\textsc{enis}}
\author{C\textsc{harlotte} D\textsc{ion}}
\author{M\textsc{iguel} M\textsc{artinez}}

\address{Eugene Frimpong is a Graduate student in the Coastal Resources Management PhD Program at the East Carolina University. Daniel R. Petrolia and Ardian Harri are Professors in the Department of Agricultural Economics at the Mississippi State University. John H. Cartwright is a Research associate in the Geosystems Research Institute at the Mississippi State University.}

\abstract[Abstract]{The recent advent of modern technology has generated a large number of datasets which
can be frequently modeled as functional data.
This paper focuses on the problem of multiclass classification for stochastic diffusion paths.
In this context we establish a closed formula for the optimal Bayes rule.
We provide new statistical procedures which are built either on the \textit{plug-in} principle or on the empirical risk minimization principle.
We show the consistency of these procedures under mild conditions. We apply our methodologies to the parametric case and
illustrate their accuracy with a simulation study through examples.}

\keywords{diffusion paths, drift estimation, multiclass
classification, plug-in estimators}

\maketitle

\section{Sample for First Level Head}\label{sec:intro}

In the multiclass classification framework, it is assumed that we have at our disposal a learning sample of observations that consists of $N$ independent realizations of $(X,Y)$ with the feature $X \in \mathcal{X}$ and the label $Y\in \{1, \dots, K\}$ constructed on some probability space
$(\Omega, \mathcal{F}, \mathbb{P})$.
For a new observation $X$ the goal is to predict the associated unobserved label $Y$.
This is done through a \textit{classifier} ${g} : \mathcal{X} \rightarrow \{1, \dots, K\}$. The misclassification risk of $g$ is
$\mathbb{P}( {g}(X) \neq Y)$.
The accuracy of the classifier is then evaluated by comparison with the Bayes classifier $g^{*}$. For $x \in \mathcal{X}$, $g^{*}(x)$ is defined as the maximizer over $\{1, \dots, K \}$ of the conditional probabilities $\mathbb{P}(Y=k| X= x)$. Moreover, the Bayes classifier minimizes the misclassification error over the set of all classifiers \cite{park2002,hu2017tps3157}. Therefore, the performance of an arbitrary classifier $g$ is measured by considering the \textit{excess risk} $\mathbb{P}( {g}(X) \neq Y)-\mathbb{P}( {g^{*}}(X) \neq Y)$.
In statistical learning, the joint distribution of $(X,Y)$ is unknown. Consequently, based on the learning sample, the objective is to build an empirical classifier $\widehat{g}$ such that the expectation of its excess risk tends to zero as $N$ tends to infinity (\textit{consistency}).

\begin{table*}[!t]
\processtable{Average and standard deviation of the misclassification error rate for the three procedures with $n=50$.\label{tab:error1}}{%
\begin{tabular}{ccccccc}
\toprule
& \multicolumn{2}{c}{\texttt{MLE}} & \multicolumn{2}{c}{\texttt{CM}} &\multicolumn{2}{c}{\texttt{OVA}}\\
\midrule
Model 1& 0.32 (0.02) & 0.31 (0.01) & 0.34 (0.04)& 0.31 (0.01) & 0.33 (0.05) & 0.31 (0.01)\\
Model 2 & 0.12 (0.01) & 0.12 (0.01) & 0.13 (0.02)& 0.12 (0.01) & 0.13 (0.02) & 0.12 (0.01) \\
Model 3 &0.67 (0.02) & 0.67 (0.01) & 0.67 (0.01)& 0.67 (0.01) & 0.66 (0.02) & 0.66 (0.01) \\
Model 4 & 0.34 (0.01) & 0.33 (0.01) & 0.36 (0.04) & 0.33 (0.01) & 0.35 (0.03) & 0.33 (0.01) \\
\botrule
\end{tabular}}{\textit{Note:} Based on an estimator $\widehat{b}$ of $b$, we consider an estimator of the conditional probabilities.
Then, for each estimator $\widehat{b}$, we consider the empirical classifier $\widehat{g} := g_{\widehat{b}}$ defined as the maximizer of the estimated conditional probabilities.}
  \end{table*}

\vspace*{-12pt}
\subsection{Example for Second Level Head}

The classification problem for diffusion sample paths may be regarded as a particular case of functional data analysis problems. Many methods have been developed to solve such problems in general.
Among all these methodologies, we may mention $k$-nearest neighbors in Hilbert spaces that could be applied to our classification problem.
There is also some recent developments for related problems such as functional random forests, functional principal component analysis, kernel estimators, just to mention a few of them.
 Recent works on depth classification for functional data propose various elegant computational solutions.

Finally, we recall that the procedure \texttt{MLE} relies on estimators of the design $\theta^{*}$ for which we provide consistency and asymptotic normality in Section~1. Hereafter, we briefly evaluate these properties.
Considering $\theta^{*} = 1$ for model~1, $\theta^{*} = 3$ for model~2, $\theta^{*} = 1/2$ for model~3, and $\theta^{*} = 2$ for model~4, we evaluate the empirical quadratic risks of the estimator on $B=300$ repetitions.
A dataset consists of $N \in \{50,100,1000\}$ trajectories composed of $n = 250$ points. The results are shown in Table~1. 

As expected, the evaluations of the quadratic risk (given $\times 10^{2}$) are increasingly close to 0 with respect to $N$. At the same time, one can see that the estimates have relatively poor performances for small $N$, especially for model~1. We illustrate in Figure~1 the constraint $N/n= o(1)$ required for the asymptotic normality.
The variance decreases with $N$ and the bias decreases with $n$. As expected, the best situation is to have both parameters large enough. Lastly, Figure~2 gives an illustration of Theorem~1. One can see that the empirical distribution functions of $\sqrt{N}\left(\widehat{\theta} - \theta\right)$ are close to the Gaussian
distribution function of the theoretical limit.

\begin{figure}
\centerline{\includegraphics{figure2.eps}}
\caption{Temporal change of internal oxygen concentration in rice leaves during fungal infection. Internal oxygen contents were measured using VisiSens USB microscope. Mock and INF correspond
to rice leaf inoculated with water and M. oryzae respectively. Significance was determined by One-way ANOVA with Tukey's HSD test. Red colour indicates significant differences.}
\end{figure}

\begin{figure*}
\centerline{\includegraphics{figure1.eps}}
\caption{Temporal change of internal oxygen concentration in rice leaves during fungal infection. Internal oxygen contents were measured using ‘VisiSens’ USB microscope. Mock and INF correspond
to rice leaf inoculated with water and M. oryzae respectively. Significance was determined by One-way ANOVA with Tukey’s HSD test. Red colour indicates significant differences.}
\end{figure*}

A substantial part of the paper is devoted to the study of the parametric case. We study minimum contrast estimators of the parameters that rule the drift and show their consistency and asymptotic normality. The resulting plug-in classification procedure is then shown to be consistent.
Furthermore, we propose to use a convex version of the empirical risk minimizer which involves convex surrogates of the misclassification risks. We present here two new easily implementable classifiers and prove their consistency.

In comparison\footnote{FEMA adjusts all premium rates upward to offset income lost as a result of the discounts. If community mitigation efforts reduce claims, then the premium adjustment is larger than needed and yields additional income to the NFIP (CBO2017).}, the present work brings three main extensions.
The first one is the generalization of the binary misclassification problem for diffusion paths to the corresponding multiclass classification problem.
The second one is the discrete setting of our framework. Closer to reality, we assume that the data collected are recorded at discrete times. This introduces an additional error term due to the time step and we give the order of this additional error in the rates of convergence.
Thirdly, in the parametric setting, we exhibit procedures that are easily implementable.
 We present convincing numerical results on some classical examples.

We provide a closed formula for the optimal Bayes classifier which yields an explicit representation for the excess risk of a general classifier.
Thus, the relation between the conditional probabilities $\mathbb{P}(Y=k | X)$ and the vector $b$ of unknown drift functions is fully explicit.
Our strategy relies on the plug-in principle. Based on an estimator $\widehat{b}$ of $b$, we consider an estimator of the conditional probabilities.
Then, for each estimator $\widehat{b}$, we consider the empirical classifier $\widehat{g} := g_{\widehat{b}}$ defined as the maximizer of the estimated conditional probabilities.
The major part of the paper is then devoted to show that plug-in classification procedures derived from drift coefficient estimators are indeed consistent.
In particular we first exhibit a sufficient condition on the estimator $\widehat{b}$ which ensures the consistency of the resulting procedure.
Secondly, we construct an estimator based on the minimization of the empirical risk over the learning sample.
We show the consistency of this new procedure. Under mild assumptions, we show that the rate of convergence is comparable to the one obtained in the multiclass context with discrete observations.

A substantial part of the paper is devoted to the study of the parametric case. We study minimum contrast estimators of the parameters that rule the drift and show their consistency and asymptotic normality. The resulting plug-in classification procedure is then shown to be consistent.
Furthermore, we propose to use a convex version of the empirical risk minimizer which involves convex surrogates of the misclassification risks. We present here two new easily implementable classifiers and prove their consistency.

In comparison to \citep{park88}, the present work brings three main extensions.
The first one is the generalization of the binary misclassification problem for diffusion paths to the corresponding multiclass classification problem.
The second one is the discrete setting of our framework. Closer to reality, we assume that the data collected are recorded at discrete times. This introduces an additional error term due to the time step and we give the order of this additional error in the rates of convergence.
Thirdly, in the parametric setting, we exhibit procedures that are easily implementable.
 We present convincing numerical results on some classical examples.

We provide a closed formula for the optimal Bayes classifier which yields an explicit representation for the excess risk of a general classifier.
Thus, the relation between the conditional probabilities $\mathbb{P}(Y=k | X)$ and the vector $b$ of unknown drift functions is fully explicit.
Our strategy relies on the plug-in principle. Based on an estimator $\widehat{b}$ of $b$, we consider an estimator of the conditional probabilities.
Then, for each estimator $\widehat{b}$, we consider the empirical classifier $\widehat{g} := g_{\widehat{b}}$ defined as the maximizer of the estimated conditional probabilities.
The major part of the paper is then devoted to show that plug-in classification procedures derived from drift coefficient estimators are indeed consistent.
In particular we first exhibit a sufficient condition on the estimator $\widehat{b}$ which ensures the consistency of the resulting procedure.
Secondly, we construct an estimator based on the minimization of the empirical risk over the learning sample.
We show the consistency of this new procedure. Under mild assumptions, we show that the rate of convergence is comparable to the one obtained in the multiclass context with discrete observations.

A substantial part of the paper is devoted to the study of the parametric case. We study minimum contrast estimators of the parameters that rule the drift and show their consistency and asymptotic normality. The resulting plug-in classification procedure is then shown to be consistent.
Furthermore, we propose to use a convex version of the empirical risk minimizer which involves convex surrogates of the misclassification risks. We present here two new easily implementable classifiers and prove their consistency.

In comparison, the present work brings three main extensions.
The first one is the generalization of the binary misclassification problem for diffusion paths to the corresponding multiclass classification problem.
The second one is the discrete setting of our framework. Closer to reality, we assume that the data collected are recorded at discrete times. This introduces an additional error term due to the time step and we give the order of this additional error in the rates of convergence.
Thirdly, in the parametric setting, we exhibit procedures that are easily implementable.
 We present convincing numerical results on some classical examples.

\begin{table}
\processtable{{Average and standard deviation of the misclassification error rate for the Bayes classifier with $n=2500$. \label{table:bayesRisk}}}{%
\tabcolsep=50pt\begin{tabular}{@{}cc@{}}
\toprule
& \texttt{Bayes rule}\\
\midrule
Model 1& 0.31 (0.002) \\
Model 2 & 0.12 (0.003) \\
Model 3 & 0.22 (0.003) \\
Model 4 & 0.33 (0.004)\\
\botrule
\end{tabular}}{\textit{Note:} Based on an estimator $\widehat{b}$ of $b$, we consider an estimator of the conditional probabilities.
Then, for each estimator $\widehat{b}$, we consider the empirical classifier $\widehat{g} := g_{\widehat{b}}$ defined as the maximizer of the estimated conditional probabilities.}
 \end{table}



We provide a closed formula for the optimal Bayes classifier which yields an explicit representation for the excess risk of a general classifier.
Thus, the relation between the conditional probabilities $\mathbb{P}(Y=k | X)$ and the vector $b$ of unknown drift functions is fully explicit.
Our strategy relies on the plug-in principle. Based on an estimator $\widehat{b}$ of $b$, we consider an estimator of the conditional probabilities.
Then, for each estimator $\widehat{b}$, we consider the empirical classifier $\widehat{g} := g_{\widehat{b}}$ defined as the maximizer of the estimated conditional probabilities.
The major part of the paper is then devoted to show that plug-in classification procedures derived from drift coefficient estimators are indeed consistent.
In particular we first exhibit a sufficient condition on the estimator $\widehat{b}$ which ensures the consistency of the resulting procedure.
Secondly, we construct an estimator based on the minimization of the empirical risk over the learning sample.
We show the consistency of this new procedure. Under mild assumptions, we show that the rate of convergence is comparable to the one obtained in the multiclass context with discrete observations.

A substantial part of the paper is devoted to the study of the parametric case. We study minimum contrast estimators of the parameters that rule the drift and show their consistency and asymptotic normality. The resulting plug-in classification procedure is then shown to be consistent.
Furthermore, we propose to use a convex version of the empirical risk minimizer which involves convex surrogates of the misclassification risks. We present here two new easily implementable classifiers and prove their consistency.

In comparison, the present work brings three main extensions.
The first one is the generalization of the binary misclassification problem for diffusion paths to the corresponding multiclass classification problem.
The second one is the discrete setting of our framework. Closer to reality, we assume that the data collected are recorded at discrete times. This introduces an additional error term due to the time step and we give the order of this additional error in the rates of convergence.
Thirdly, in the parametric setting, we exhibit procedures that are easily implementable.
 We present convincing numerical results on some classical examples.

The resulting sets of score functions are bounded. Indeed, for each $\theta \in {\Theta}$ and $i \in \mathcal{Y}$,
\begin{equation*}
|\overline{h}_{\theta}^{\varepsilon,i}(\overline{X})| \leq \log\left(\frac{1}{\varepsilon}\right).
\end{equation*}

We are now able to define the empirical risk threshold minimizer
\begin{equation}\label{eq:thetariskmin}
\widehat{\theta} \in \argmin{\theta \in {\Theta^{K}}}{\widehat{R}_{\phi}(\overline{h}_{\theta}^{\varepsilon})},
\end{equation}
with $\widehat{R}_{\phi}$ given in Eq.~1. Note that $\widehat{\theta}$ depends on $\varepsilon$ but, for the sake of simplicity, the dependency will only appear in the notation of the scores $h,\overline{h}$.
The following proposition establishes the consistency of the corresponding classification strategy with respect to the $\phi$-risk of $\overline{h}_{\widehat{\theta}}^{\varepsilon}$.

\medskip
\begin{proposition}
\label{prop:consistencyPhiRisk}
Assume that $\Theta = [0,1]^{d}$ and that there exists $\alpha > 0$ such that $\Delta = O(N^{-\alpha})$. Under Assumption~1, if
$\varepsilon = O\left(N^{-\beta}\right)$ with $0 < \beta < \min(1/2, \alpha/4)$ then the classification procedure $\overline{h}^{\varepsilon}_{\widehat{\theta}}$ given by~\eqref{eq:thetariskmin} satisfies,
\begin{equation*}
\widehat{\mathbf{E}}\left[R_{\phi}(\overline{h}^{\varepsilon}_{\widehat{\theta}}) - R_{\phi}({h}^{*}) \right] \underset{N \to \infty}{\longrightarrow} 0.
\end{equation*}
\end{proposition}

Therefore, the results of Proposition \ref{prop:consistencyPhiRisk} and the calibration property ensure the consistency of the classification procedure
$\overline{g}_{b_{\widehat{\theta}}} $
with respect to the misclassification risk.

\medskip
\begin{theorem}\label{theo:OneVersusAll}
Assume that $\Theta = [0,1]^{d}$ and that there exists $\alpha \geq 2$ such that $\Delta = O(N^{-\alpha})$. Under Assumption~1, the classification procedure $\overline{g}_{b_{\widehat{\theta}}}$ given by (1) satisfies,
\begin{equation*}
\widehat{\mathbf{E}}\left[R(\overline{g}_{{b}_{\widehat{\theta}}}) - R(g^{*}) \right] \leq O\left(K\sqrt{\dfrac{\alpha d \log(N)}{N}}\right).
\end{equation*}
\end{theorem}
We can note that up to the logarithmic factor, we obtain a rate of convergence of order of $N^{-1/2}$.
Compared to the rate provided in Corollary~1, this rate is better due to the lower complexity of the parametric model.
Interestingly, if we consider $\displaystyle \widehat{\theta} \in \argmin{\theta \in {\Theta}_{N}}{\widehat{R}(\overline{g}_{b_{{\theta}}})}$ with ${\Theta}_{N}$ a $1/N$-net of $\Theta^{K}$,
from Theorem~1, one can show that the rate of convergence is also of order $N^{-1/2}$. Hence, from a theoretical point of view, the use of convex
surrogate does not degrade the performances of the classification procedure when $\alpha \geq 2$.

Let us describe the models under consideration for our numerical experiments.
We fix $K=3$, $p_{i} = 1/K$ and $\sigma=1$. We consider the following examples:
\begin{enumerate}
\item \textit{Additive OU} $b(\theta, x)=-(x-\theta)$, $x_{0}=4$;
\item \textit{Multiplicative OU} $b(\theta, x)=-\theta x$, $x_{0}=4$;
\item \textit{Polynomial} $b(\theta, x)=-(x-\theta)^{3}-(x+\theta)^{3}$, $x_{0}=4$;
\item \textit{Hyperbolic} $b(\theta, x) = -\theta x / \sqrt{1+x^{2}}$, $x_{0}=4$.
\end{enumerate}

We investigate $K = 3$. Hence, in our study the number of classes $K$ is not supposed to grow and be large.
We compare the results on the design: $ \theta^{*} = \{1,2,4\}$ for model 1, 2, 4, and $ \theta^{*} = \{1/4,1/2,1\}$ for model 3.
Models 1 and 2 are widely used in practical applications, and they satisfy all the assumptions required for our theoretical results, while model~3 does not fulfill Assumption~1, illustrating the robustness of the classification procedures.
Model 4 is known as the hyperbolic model in mathematical finance, and it is used to model log-returns of assets prices in stock markets.

In order to illustrate our convergence results, for each model we provide an evaluation of the misclassification risk of the Bayes rule.
To this end, we repeat $B$ times the following steps:
\begin{enumerate}
\item[i)] simulate a data set $\mathcal{D}_{M}$ with $M = 10000$ and 2500 points for each trajectory,
\item[ii)] based on $\mathcal{D}_{M}$, compute the misclassification error rate of the classifier $\overline{g}_{b_{\theta^{*}}}$.
\end{enumerate}

Finally, we compute the mean and standard deviation of the misclassification risk; the results are reported in Table~\ref{table:bayesRisk} with $B = 100$. One can see that model~1 and model~4 seem to be trickier for the misclassification risk. This is due to the fact that the classes generated by $\theta_{1}^{*}$ and $\theta_{2}^{*}$ overlap considerably. On the contrary, the classification problem involved by model~2 is easier although the considered design is the same.

\subsection{Numerical performances of the classification procedures}
\label{subsec:classPROC}

Now, for each model we evaluate the misclassification risk of the three classification procedures presented in Section~1.
The procedure based on the contrast estimation is referred as~\texttt{MLE}: $\overline{g}_{\widehat{\theta}}$ with $\widehat{\theta}$ from Eq.~1; the procedure which relies on the constrained method is referred as~\texttt{CM} $\overline{g}_{\widehat{\theta}}$ with $\widehat{\theta}$ given in Eq.~1 (with $\varepsilon=0.01$); while
the procedure based on the one-versus-all strategy is referred as~\texttt{OVA} $\overline{g}_{\widehat{\theta}}$ with $\widehat{\theta}$ given in Eq.~1.
The three procedures rely on an optimization function (in Python or R languages \texttt{optim} is used with argument method \texttt{"BFGS"}). In the case where the \texttt{MLE} is an explicit estimator the procedure is naturally fast.
Other optimization functions could be used to reduce the computational cost of the procedures in other cases.

In order to stress the robustness w.r.t the theoretical conditions between $\Delta, N$, we consider the following asymptotics: $n \in \{50,250\}, ~\Delta= 1/n, ~N \in \{50, 500 \}$.

Table~\ref{tab:error1} provides the mean and standard deviation of the results.
Our main observation is that, except for model~3 with $n=50$, all the classification procedures perform well. Indeed, the evaluations of the misclassification risk are close to the Bayes risk with small variances.
In particular, for $N = 500$, the classification procedures have similar performances. Furthermore, we can see the influence of the sample size
for the procedures \texttt{CM} and \texttt{OVA}.
For instance for model~1 with $n = 50$, the risk of the procedure \texttt{CM} is evaluated
at 0.34 (with standard deviation equal to 0.04) for $N=50$, while it is evaluated at 0.31 (with standard deviation equal to 0.01) for $N=500$.
Interestingly, this is not the case for \texttt{MLE}. Hence, it seems to be preferable to use the classification procedure \texttt{MLE} when the sample size is moderate.
Then, we can see that the parameter $n$ plays a crucial role for model~3. Indeed,
for $n = 50$ all procedures have poor performances while for $n = 250$ the empirical risks are all close to the Bayes classifier.

Regarding the case where the diffusion coefficient is unknown, we believe that things are more intricate. It is well known that the coefficient $\sigma$ can be estimated in the high-frequency scheme of observations. For example, estimators studied in the literature may be used. For example a strategy where an estimator of $\sigma$ is plugged in the expressions of $\overline{F}$
could be investigated and one may hope to extend the results of this paper in this new framework. However, the contrast function of the plug-in classification procedure described in Section~1 seems more tricky to study.
\begin{enumerate}
\item[i.] \textit{Additive OU} $b(\theta, x)=-(x-\theta)$, $x_{0}=4$;
\item[ii.] \textit{Multiplicative OU} $b(\theta, x)=-\theta x$, $x_{0}=4$;
\item[iii.] \textit{Polynomial} $b(\theta, x)=-(x-\theta)^{3}-(x+\theta)^{3}$, $x_{0}=4$;
\item[iv.] \textit{Hyperbolic} $b(\theta, x) = -\theta x / \sqrt{1+x^{2}}$, $x_{0}=4$.
\end{enumerate}

\paragraph{The theoretical limit}
Note that for the empirical risk minimization procedure, one may adapt the strategy and circumvent this difficulty by including $(b_{i}/\sigma^{2})_{i\in \mathcal{Y}}$ and $({b_{i}^{2}}/\sigma^{2})_{i\in \mathcal{Y}}$ in the minimization procedure.
In the context where $\sigma$ is unknown, another challenging issue would be to consider the case where classes are discriminated by both the drift and the diffusion coefficients.
Indeed, it seems difficult to adapt directly the previous strategies that rely heavily on Girsanov's formula.
\begin{itemize}
\item \textit{Additive OU} $b(\theta, x)=-(x-\theta)$, $x_{0}=4$;
\item \textit{Multiplicative OU} $b(\theta, x)=-\theta x$, $x_{0}=4$;
\item \textit{Polynomial} $b(\theta, x)=-(x-\theta)^{3}-(x+\theta)^{3}$, $x_{0}=4$;
\item \textit{Hyperbolic} $b(\theta, x) = -\theta x / \sqrt{1+x^{2}}$, $x_{0}=4$.
\end{itemize}

 One may also think of generalizing the initial model in order to cover a broader class of possible applications. A first generalization would be to extend our results to the case of diffusions with inhomogeneous coefficients. We believe our results extend easily to a framework where the classes are discriminated by inhomogeneous drift functions as long as suitable assumptions are made (for e.g. the drift functions are Lipschitz for the space variable uniformly w.r.t the time variable).

\vspace*{-12pt}
\section*{Supplementary Material}

Supplementary material is available online at American Journal of Agricultural Economics online.

\section*{Acknowledgments}

The authors would like to thank Valentine Genon-Catalot for the many fruitful discussions and advice.


\begin{thebibliography}{}

\bibitem[{Allison}(2009)]{hu2017tps3157}
Allison, PD. 2009. \textit{Fixed Effects Regression Models}. Newbury Park, CA: Sage.

\bibitem[{Mazmanian et~al.}(2001)]{Maz01}
American Community Survey. 2013. Census Data. Available at: http://www.census.gov/programs-\break surveys/acs/data/summary-file.2013.html.

\bibitem[Brody et~al.(2009)]{park2002}
Brody, Samuel D, Sammy Zahran, Wesley E Highfield, Sarah P Bernhardt, and Arnold Vedlitz. 2009. Policy Learning for Flood Mitigation: A Longitudinal Assessment of Community Rating System in Florida. \textit{Risk Analysis} 29: 912–29.

\vfill\eject

\bibitem[Buntin et~al.(2004)]{park88}
Buntin, Melinda B, and Alan M Zaslavsky. 2004. Too Much Ado about Two-Part Models and Transformation? Comparing Methods of Modeling Medicare Expenditures. \textit{Journal of Health} Economics 23: 525–42.

\bibitem[Flannery(2010)]{smith99}
Flannery, Mark J, and Kristine W Hankins. 2010. Estimating Dynamic Panel Models in Corporate Finance. \textit{Journal of Corporate Finance} 19: 1–19.

\end{thebibliography}

\end{document}
